We want to look at real datasets to simulate realistic datasets.

data("cortical")
pathLabels = "~/Documents/BRAIN/gitrepo/brainData/analysis/results_160705/cluster_labels.txt"
labels <- read.table(pathLabels, stringsAsFactors = FALSE, sep='\t',
                     comment.char = "%")

prefilter <- cortical[,rownames(labels[!(labels$ClusterMergedLabel %in% c("-1", "m2", "m4", "m5", "m6")),])]
filter <- apply(assay(prefilter)>10, 1, sum)>=10

postfilter <- prefilter[filter,]
batch <- droplevels(colData(postfilter)$MD_c1_run_id)
qc <- as.matrix(colData(postfilter)[,1:14])
qcpca <- prcomp(qc, center=TRUE, scale.=TRUE)
mod <- model.matrix(~ batch + qcpca$x[, 1:2])
bio <- factor(labels[!(labels$ClusterMergedLabel %in% c("-1", "m2", "m4", "m5", "m6")), "ClusterMergedColor"])
postfilter = assay(postfilter)

vars <- rowVars(log1p(postfilter))
names(vars) <- rownames(postfilter)
vars <- sort(vars, decreasing = TRUE)
core <- postfilter[names(vars)[1:1000],]
detection_rate <- colSums(core>0)
coverage <- colSums(core)

We only focus on the cells that passed the QC filters (by the original authors) and that were part of the “core” clusters. We account for batch. We will color-code the cells by inferred cluster. Select cells, remove ERCC spike-ins, filter out the genes that do not have at least 10 counts in at least 10 cells. Number of retained genes:

print(sum(filter))
## [1] 7591
dim(core)
## [1] 1000  332

To speed up the computations, I will focus on the top 1,000 most variable genes. IS IT A GOOD IDEA?

mean = apply(log1p(assay(prefilter)), 1, function(x) mean(x[x!=0]))
plot(mean, rowMeans(assay(prefilter) == 0), xlim = c(1,11), ylim = c(0,1),
              xlab = "Mean Expression", ylab = "Dropout Rate", 
              main = 'no filtering')

pal =colorRampPalette(c("black","black", "red","yellow"), space="rgb")
par(mfrow = c(1,3))
smoothScatter(mean, rowMeans(assay(prefilter) == 0), xlim = c(0,8),
              nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
              xlab = "Mean Expression", ylab = "Dropout Rate", 
              colramp =pal, main = 'no filtering')

mean = apply(log1p(postfilter), 1, function(x) mean(x[x!=0]))
smoothScatter(mean, rowMeans(postfilter == 0),xlim = c(1,11),
              nbin = 256, nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
              xlab = "Mean Expression", ylab = "Dropout Rate", 
              colramp = pal, main = 'after filtering')

mean = apply(log1p(core), 1, function(x) mean(x[x!=0]))
smoothScatter(mean, rowMeans(core == 0),xlim = c(1,11),
              nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
              xlab = "Mean Expression", ylab = "Dropout Rate", 
              colramp = pal, main = '1,000 most variable genes')

Pink and glia cells have been removed. Fit data with K = 3, V intercepts in both Mu and Pi, commondispersion = FALSE, with X accounting for batch, qc.

print(system.time(zinb <- zinbFit(core, ncores = 3, K = 3, X = mod,
                                  commondispersion = FALSE)))
##    user  system elapsed 
## 693.297 313.229 425.589

True W

If we simulate W from real data, it will look like that.

pairs(zinb@W, col=cols[bio], pch=19, main="W\ncolor = analysis results 160705")

Gamma

gamma_mu = zinb@gamma_mu[1,]
gamma_pi = zinb@gamma_pi[1,]

df <- data.frame(gamma_mu = gamma_mu,
                 gamma_pi = gamma_pi,
                 detection_rate = detection_rate,
                 coverage = coverage)
pairs(df, pch=19, col=cols[bio])

print(cor(df, method="spearman"))
##                  gamma_mu   gamma_pi detection_rate   coverage
## gamma_mu        1.0000000 -0.2439368      0.5252101  0.9883636
## gamma_pi       -0.2439368  1.0000000     -0.8470762 -0.2861173
## detection_rate  0.5252101 -0.8470762      1.0000000  0.5482978
## coverage        0.9883636 -0.2861173      0.5482978  1.0000000
gamma = data.frame(gamma_mu = gamma_mu, gamma_pi = gamma_pi, celltype = bio)
gamma = melt(gamma)

ggplot(gamma, aes(x = value)) + 
  geom_histogram(aes(y = ..density..), bins = 20, col = 'gray') +
  geom_density(col= 'blue', size = .5) + 
  facet_grid(~ variable) + xlab('gamma')

ggplot(gamma, aes(x = variable, y = value)) + xlab('') + theme_bw() +
  geom_boxplot() + coord_flip() + facet_grid(~ variable, scales = 'free') +
  theme_bw() + ylab('gamma') + scale_x_discrete(breaks = c('', ''), drop=FALSE)

ggplot(gamma, aes(value, fill = celltype)) + geom_density(alpha = 0.2) +
  facet_grid(~ variable) + xlab('gamma') + theme_bw()

Beta

Note that we accounting for batch and qc in X, so beta has more than one row (M = 25 rows). Here, we look only at the first row of beta.

beta_mu = zinb@beta_mu[1,]
beta_pi = zinb@beta_pi[1,]

df <- data.frame(beta_mu_intercept = beta_mu,
                 beta_pi_intercept = beta_pi)
pairs(df, pch=19)

print(cor(df, method="spearman"))
##                   beta_mu_intercept beta_pi_intercept
## beta_mu_intercept         1.0000000        -0.3610809
## beta_pi_intercept        -0.3610809         1.0000000
beta = data.frame(beta_mu_intercept = beta_mu, beta_pi_intercept = beta_pi)
beta = melt(beta)
ggplot(beta, aes(x = value)) + 
  geom_histogram(aes(y = ..density..), bins = 100, col = 'gray') +
  geom_density(col= 'blue', size = .5) + 
  facet_grid(~ variable, scales = 'free') + xlab('beta')

ggplot(beta, aes(x = variable, y = value)) + theme_bw() + xlab('') +
  geom_boxplot() + facet_grid(~ variable, scales = 'free') + coord_flip() +
  theme_bw() + ylab('beta') + scale_x_discrete(breaks = c('', ''), drop=FALSE)

# remove outliers
max = max(quantile(beta_pi, 0.1), quantile(beta_mu, 0.1))
min = min(quantile(beta_pi, 0.1), quantile(beta_mu, 0.1))
ggplot(beta, aes(x = variable, y = value)) + theme_bw() + xlab('') +
  geom_boxplot(outlier.shape = NA) +
  facet_grid(~ variable, scales = 'free') + coord_flip() +
  theme_bw() + ylab('beta removing outliers') +
  scale_x_discrete(breaks = c('', ''), drop=FALSE) +
  scale_y_continuous(limits = c(min, max))
## Warning: Removed 1314 rows containing non-finite values (stat_boxplot).

Alpha

pairs(t(zinb@alpha_mu), main = 'alpha_mu')

pairs(t(zinb@alpha_pi), main = 'alpha_pi')

df <- data.frame(alpha_mu_1 = zinb@alpha_mu[1, ],
                 alpha_mu_2 = zinb@alpha_mu[2, ],
                 alpha_mu_3 = zinb@alpha_mu[3, ],
                 alpha_pi_1 = zinb@alpha_pi[1, ],
                 alpha_pi_2 = zinb@alpha_pi[2, ],
                 alpha_pi_3 = zinb@alpha_pi[3, ])
pairs(df, pch=19)

print(cor(df, method="spearman"))
##              alpha_mu_1   alpha_mu_2   alpha_mu_3   alpha_pi_1
## alpha_mu_1  1.000000000 -0.002558535 -0.118975103 -0.300629485
## alpha_mu_2 -0.002558535  1.000000000 -0.007490167 -0.051727888
## alpha_mu_3 -0.118975103 -0.007490167  1.000000000  0.049573562
## alpha_pi_1 -0.300629485 -0.051727888  0.049573562  1.000000000
## alpha_pi_2 -0.020078504 -0.104977173 -0.067802696 -0.003056859
## alpha_pi_3  0.141028173 -0.090183390 -0.307602548 -0.040068676
##              alpha_pi_2  alpha_pi_3
## alpha_mu_1 -0.020078504  0.14102817
## alpha_mu_2 -0.104977173 -0.09018339
## alpha_mu_3 -0.067802696 -0.30760255
## alpha_pi_1 -0.003056859 -0.04006868
## alpha_pi_2  1.000000000  0.08323482
## alpha_pi_3  0.083234819  1.00000000

Dispersion

par(mfrow=c(1,1))
set = newSeqExpressionSet(core)
fq = betweenLaneNormalization(set, which = "full", offset = T)
disp = estimateDisp(counts(fq), offset = -offst(fq))
## Design matrix not provided. Switch to the classic mode.
plot(disp$tagwise.dispersion, 1/exp(zinb@zeta), ylab = 'zinb dispersion',
     xlab = 'edgeR tagwise dispersion', main = 'Dispersion')

print(cor(disp$tagwise.dispersion, 1/exp(zinb@zeta), method="spearman"))
## [1] 0.7247025
par(mfrow = c(1, 2))
plot(density(1/exp(zinb@zeta)), main = 'zinb dispersion')
plot(density(disp$tagwise.dispersion), main = 'edgeR dispersion')

par(mfrow = c(1, 2))
mean = apply(log1p(core), 1, function(x) mean(x[x!=0]))
plot(mean, 1/exp(zinb@zeta), main = 'zinb', xlab = 'mean expression', ylab = 'dispersion', ylim = c(0, 9))
plot(mean, disp$tagwise.dispersion, main = 'edgeR',xlab = 'mean expression', ylab = 'dispersion', ylim = c(0, 9))

Estimated mu and pi

par(mfrow=c(1, 2))
detection_rate <- colMeans(core>0)
coverage <- colSums(core)
plot(rowMeans(getPi(zinb)), detection_rate, xlab="Average estimated Pi", ylab="Detection Rate for each cell", pch=19, col=cols[bio], ylim = c(0, 1))
plot(rowMeans(log1p(getMu(zinb))), log1p(coverage), xlab="Average estimated log Mu", ylab="log Coverage", pch=19, col=cols[bio])

par(mfrow=c(1, 2))
smoothScatter(mean, rowMeans(core == 0),xlim = c(2,8),
              nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
              xlab = "Mean Expression", ylab = "Dropout Rate", 
              colramp = pal, main = 'Observed')
smoothScatter(colMeans(log1p(getMu(zinb))), colMeans(getPi(zinb)), nbin = 256,
              nrpoints=Inf, pch="", cex=.7, xlim = c(2,8),
              xlab = "Estimated Mean Expression", main = 'Fitted',
              ylab = "Estimated Dropout Rate",ylim = c(0,1),
              colramp = pal)

mdplot(matrix(c(colMeans(log1p(getMu(zinb))), mean), ncol = 2),
       main = 'MD plot - mu')
abline(h=0, col = 'red')
mdplot(matrix(c(colMeans(getPi(zinb)), rowMeans(core == 0)), ncol = 2),
       main = 'MD plot - pi')
abline(h=0, col = 'red')